# set working directory
setwd('~/replication_files')

# function to load and install packages
load_packages<-function(packages){
  sapply(packages,function(x){
    f<-require(x,character.only=T)
    if(!f) install.packages(x); f<-require(x,character.only=T)
    return(f)},USE.NAMES=T)
}

# load packages
load_packages(c('data.table','gtools','sandwich','miceadds'))

# format numbers to specific decimal places
specify_decimal<-function(x,ndec=3,drop0leading=F,drop0trailing=F,comma=F){
  f<-format(round(x,ndec),nsmall=ndec,drop0trailing=drop0trailing,scientific=F,big.mark=ifelse(comma,",",""))
  if(drop0leading){
    f<-sub("0.",".",f,fixed=T)
  }
  return(trimws(f))
}

# create stars depending on p-value
stars_pval<-function (p.value) {
  unclass(symnum(p.value, corr = FALSE, na = FALSE, cutpoints = c(0, 0.001, 0.01, 0.05, 1), symbols = c("***", "**", "*", "")))
}

# read data
(FinDT<-readRDS('TablesA2A3.rds'))

# dependent variables for models
DVs<-c('provax_bin','provax_count','antivax_bin','antivax_count')
# number of models
(Nmods<-length(DVs))

# independent variables to use
(IVs_main<-rep(list(c('college','female','nonwhite','age30t44','age45t59','age60p','parent','caseid_file_date_N','log(caseid_N)','log(FB_N + 1)')),4))
# add data name
(IVs_all<-lapply(IVs_main,function(x) c(x,'Data_Name')))

# create formulas
(formulas<-lapply(1:Nmods,function(x) as.formula(paste0(DVs[x],'~',paste(IVs_all[[x]],collapse='+')))))

# not sure why but running lm.cluster first outside of lapply seems to be necessary to get it to run inside lapply:
lm.cluster(data=FinDT,formula=formulas[[1]],cluster='resp_id',weights=FinDT[['weight']])

# list of model results
modlist<-lapply(1:Nmods,function(x){
  # linear model with weights and clustered on respondent id
  mod<-miceadds::lm.cluster(data=FinDT,formula=formulas[[x]],cluster='resp_id',weights=FinDT[['weight']])
  # summary
  modsum<-summary(mod)
  # coefficients
  coefs<-modsum[c(IVs_main[[x]],'(Intercept)'),]
  
  # r squared
  (R2<-specify_decimal(summary(mod$lm_res)$r.squared,3))
  # N
  (N<-paste0('\\multicolumn{1}{l}{',formatC(length(summary(mod$lm_res)$residuals),big.mark=','),'}'))
  
  # transform coefficients
  (Est<-specify_decimal(coefs[,1],3))
  # transform standard errors
  (SE<-paste0('(',specify_decimal(coefs[,2],3),')'))
  # change p value to stars
  (p<-stars_pval(coefs[,4]))
  # combine coefficient with stars
  (Estp<-paste0(Est,'{$^{',p,'}$}'))
  # data.table of results
  (dt<-data.table(rn=row.names(coefs),Estp=Estp,SE=SE))
  # change name
  setnames(dt,2:3,paste0(c('Estp','SE'),x))
  # return list of data.table, r^2, and N
  list(dt=dt,R2=R2,N=N)
})

# get data.tables from list
(dtlist<-lapply(modlist,`[[`,1))
# merge data.tables together
(dtmerge<-Reduce(function(x,y) merge(x,y,by='rn',all=T),dtlist))
# create ordering data.table
(dtorder<-data.table(rn=c('college','female','nonwhite','age30t44','age45t59','age60p','parent','caseid_file_date_N','log(caseid_N)','log(FB_N + 1)','(Intercept)')))
set(dtorder,NULL,'order',1:nrow(dtorder))

# merge model results with ordering data.table and reorder
(dtmerge<-merge(dtmerge,dtorder,by='rn'))
setorder(dtmerge,order)

# columns that need fixed
(cols2fix<-setdiff(names(dtmerge),c('rn','order')))
# replace NAs with ''
for(x in cols2fix){
  set(dtmerge,dtmerge[,.I[is.na(get(x))]],which(names(dtmerge)==x),'')
}
dtmerge

# transform into Latex table
(tabMain<-lapply(1:Nmods,function(x) c(rbind(dtmerge[[paste0('Estp',x)]],dtmerge[[paste0('SE',x)]]))))
(tabMain<-do.call(cbind,tabMain))
(tabMain<-apply(tabMain,1,function(x) paste(x,collapse=' & ')))

# coefficient names for table
tabCoefs<-c('College','~graduate','Female','','Nonwhite','','Age 30--44','','Age 45--59','','Age 60+','','Parent','','Days in','~panel','log(URLs','~visited)','log(Facebook','~pages)','Constant','')

# add coefficient names and linebreaks
(tabMain<-paste(tabCoefs,tabMain,sep=' & '))
(tabMain<-paste0(tabMain,' \\\\'))

# model r^2 and N
(modR2s<-lapply(modlist,`[[`,2))
(modNs<-lapply(modlist,`[[`,3))

# create lower part of Latex table with model diagnostics
(tabBottom<-cbind(c('$R^2$','$N$'),rbind(unlist(modR2s),unlist(modNs))))
(tabBottom<-apply(tabBottom,1,function(x) paste(x,collapse=' & ')))
(tabBottom<-paste(tabBottom,'\\\\'))

# combine together
(tabContent<-c(tabMain,'\\midrule',tabBottom))

# top of Latex table
tab_head<-c('\\begin{table}[!p]',
            '\\caption{Facebook use and online vaccine information exposure (behavioral data)}',
            '\\begin{center}',
            '\\begin{threeparttable}',
            '\\begin{tabular}{l *{4}{S[table-format=1.3,table-space-text-post={$^{***}$}]} }',
            '\\toprule',
            '& \\multicolumn{2}{c}{\\myuline{Non-skeptical webpages}} & \\multicolumn{2}{c}{\\myuline{Vaccine-skeptical webpages}} \\\\',
            '& \\multicolumn{1}{c}{Visited $\\geq 1$ page} & \\multicolumn{1}{c}{Total pages visited} & \\multicolumn{1}{c}{Visited $\\geq 1$ page} & \\multicolumn{1}{c}{Total pages visited} \\\\',
            '\\midrule')

# notes for table
tab_notes<-paste('\\item \\leavevmode\\kern-\\scriptspace\\kern-\\labelsep',
                 '$^{*} p < 0.05$, $^{**} p < .01$, $^{***} p<.001$ (two-sided); OLS models with standard errors clustered by respondent and survey weights applied. All models include panel fixed effects. Data from YouGov Pulse panel participants with online traffic data available. Each observation represents behavioral web traffic data associated with responses to one of seven national surveys conducted from 2016--2019.')

# bottom of Latex table
tab_foot<-c('\\bottomrule',
            '\\end{tabular}',
            '\\begin{tablenotes}[flushleft]',
            tab_notes,
            '\\end{tablenotes}',
            '\\end{threeparttable}',
            '\\end{center}',
            '\\label{table:taba2}',
            '\\end{table}')
# combine Latex strings together
(tab_fin<-c(tab_head,tabContent,tab_foot))
# save to file
writeLines(tab_fin,'TableA2.tex')
